import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import xgboost as xgb
# sklearn
from sklearn import model_selection # train/test split
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv
helper functions
def down_sample_textbook(df):
    # balance classes 1:1 by down-sampling the majority (non-fraud) class
    df_majority = df[df.is_fraud==0].copy()
    df_minority = df[df.is_fraud==1].copy()
    df_majority_downsampled = sklearn.utils.resample(df_majority, n_samples=len(df_minority), replace=False, random_state=42)
    df_downsampled = pd.concat([df_minority, df_majority_downsampled])
    return df_downsampled
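As a quick illustration (a toy sketch, not part of the original run), down-sampling an 8:2 frame leaves two rows of each class:

toy = pd.DataFrame({'is_fraud': [0]*8 + [1]*2, 'amt': range(10)})
down_sample_textbook(toy).is_fraud.value_counts()   # expect 2 rows per class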
def compute_time_difference(group):
    # all pairwise |Δt| (in nanoseconds) between transactions sharing a card number
    n = len(group)
    result = []
    for i in range(n):
        for j in range(n):
            time_difference = abs(group.iloc[i].trans_date_trans_time.value - group.iloc[j].trans_date_trans_time.value)
            result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
    return result
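This double loop is O(n²) per card and dominates preprocessing time for active cards. A vectorized variant (a hedged sketch; compute_time_difference_vectorized is a hypothetical name, not used in the run below) produces the same [index_i, index_j, |Δt| in ns] triples:

def compute_time_difference_vectorized(group):
    # same n^2 pairs as above, built with numpy broadcasting
    t = group.trans_date_trans_time.values.astype('datetime64[ns]').astype(np.int64)
    idx = group.index.to_numpy()
    i, j = np.meshgrid(np.arange(len(t)), np.arange(len(t)), indexing='ij')
    dt = np.abs(t[i] - t[j])
    return np.column_stack([idx[i].ravel(), idx[j].ravel(), dt.ravel()])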
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 16)
        self.conv2 = GCNConv(16, 2)
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]
fraudTrain = fraudTrain.assign(trans_date_trans_time= list(map(lambda x: pd.to_datetime(x), fraudTrain.trans_date_trans_time)))
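A minor aside: pd.to_datetime accepts a whole Series at once, so the element-wise map above can be replaced by a single vectorized call (same result, typically much faster):

fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))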
fraudTrain
|  | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-01-01 00:00:00 | 2.703190e+15 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
| 1 | 2019-01-01 00:00:00 | 6.304230e+11 | fraud_Heller, Gutmann and Zieme | grocery_pos | 107.23 | Stephanie | Gill | F | 43039 Riley Greens Suite 393 | Orient | ... | 48.8878 | -118.2105 | 149 | Special educational needs teacher | 1978-06-21 | 1f76529f8574734946361c461b024d99 | 1325376044 | 49.159047 | -118.186462 | 0 |
| 2 | 2019-01-01 00:00:00 | 3.885950e+13 | fraud_Lind-Buckridge | entertainment | 220.11 | Edward | Sanchez | M | 594 White Dale Suite 530 | Malad City | ... | 42.1808 | -112.2620 | 4154 | Nature conservation officer | 1962-01-19 | a1a22d70485983eac12b5b88dad1cf95 | 1325376051 | 43.150704 | -112.154481 | 0 |
| 3 | 2019-01-01 00:01:00 | 3.534090e+15 | fraud_Kutch, Hermiston and Farrell | gas_transport | 45.00 | Jeremy | White | M | 9443 Cynthia Court Apt. 038 | Boulder | ... | 46.2306 | -112.1138 | 1939 | Patent attorney | 1967-01-12 | 6b849c168bdad6f867558c3793159a81 | 1325376076 | 47.034331 | -112.561071 | 0 |
| 4 | 2019-01-01 00:03:00 | 3.755340e+14 | fraud_Keeling-Crist | misc_pos | 41.96 | Tyler | Garcia | M | 408 Bradley Rest | Doe Hill | ... | 38.4207 | -79.4629 | 99 | Dance movement psychotherapist | 1986-03-28 | a41d7549acf90789359a9aa5346dcb46 | 1325376186 | 38.674999 | -78.632459 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1048570 | 2020-03-10 16:07:00 | 6.011980e+15 | fraud_Fadel Inc | health_fitness | 77.00 | Haley | Wagner | F | 05561 Farrell Crescent | Annapolis | ... | 39.0305 | -76.5515 | 92106 | Accountant, chartered certified | 1943-05-28 | 45ecd198c65e81e597db22e8d2ef7361 | 1362931649 | 38.779464 | -76.317042 | 0 |
| 1048571 | 2020-03-10 16:07:00 | 4.839040e+15 | fraud_Cremin, Hamill and Reichel | misc_pos | 116.94 | Meredith | Campbell | F | 043 Hanson Turnpike | Hedrick | ... | 41.1826 | -92.3097 | 1583 | Geochemist | 1999-06-28 | c00ce51c6ebb7657474a77b9e0b51f34 | 1362931670 | 41.400318 | -92.726724 | 0 |
| 1048572 | 2020-03-10 16:08:00 | 5.718440e+11 | fraud_O'Connell, Botsford and Hand | home | 21.27 | Susan | Mills | F | 005 Cody Estates | Louisville | ... | 38.2507 | -85.7476 | 736284 | Engineering geologist | 1952-04-02 | 17c9dc8b2a6449ca2473726346e58e6c | 1362931711 | 37.293339 | -84.798122 | 0 |
| 1048573 | 2020-03-10 16:08:00 | 4.646850e+18 | fraud_Thompson-Gleason | health_fitness | 9.52 | Julia | Bell | F | 576 House Crossroad | West Sayville | ... | 40.7320 | -73.1000 | 4056 | Film/video editor | 1990-06-25 | 5ca650881b48a6a38754f841c23b77ab | 1362931718 | 39.773077 | -72.213209 | 0 |
| 1048574 | 2020-03-10 16:08:00 | 2.283740e+15 | fraud_Buckridge PLC | misc_pos | 6.81 | Shannon | Williams | F | 9345 Spencer Junctions Suite 183 | Alpharetta | ... | 34.0770 | -84.3033 | 165556 | Prison officer | 1997-12-27 | 8d0a575fe635bbde12f1a2bffc126731 | 1362931730 | 33.601468 | -83.891921 | 0 |
1048575 rows × 22 columns
data preparation
_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1,_df2])
df02.shape
(214520, 22)
df50 = down_sample_textbook(df02)
df50 = df50.reset_index()
df50.shape
(12012, 23)
train/test split
df50_tr,df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)
df50_tr.shape, df50_test.shape
((9009, 23), (3003, 23))
N = len(df50)
train_mask = [i in df50_tr.index for i in range(N)]
test_mask = [i in df50_test.index for i in range(N)]
train_mask = np.array(train_mask)
test_mask = np.array(test_mask)
train_mask.sum(), test_mask.sum()
(9009, 3003)
train_mask.shape, test_mask.shape
((12012,), (12012,))
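The two list comprehensions above rescan df50_tr.index / df50_test.index for each of the N rows; since df50 carries a fresh 0..N-1 index after reset_index(), an equivalent vectorized form (a sketch producing the same masks) is:

train_mask = np.isin(np.arange(N), df50_tr.index)
test_mask = np.isin(np.arange(N), df50_test.index)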
edge_index construction
# groups = df50.groupby('cc_num')
# edge_index_list_plus = [compute_time_difference(group) for _, group in groups]
# edge_index_list_plus_flat = [item for sublist in edge_index_list_plus for item in sublist]
# edge_index_list_plus_nparr = np.array(edge_index_list_plus_flat)
# np.save('edge_index_list_plus50.npy', edge_index_list_plus_nparr)
edge_index = np.load('edge_index_list_plus50.npy')
edge_index.shape
(200706, 3)
theta = edge_index[:,2].mean()   # decay scale: mean |Δt| over all within-card pairs
edge_index = np.load('edge_index_list_plus50.npy').astype(np.float64)
# weight each pair by exp(-|Δt|/theta); pairs with Δt = 0 (weight exactly 1, i.e. self-pairs) are zeroed out
edge_index[:,2] = (np.exp(-edge_index[:,2]/theta) != 1)*(np.exp(-edge_index[:,2]/theta))
edge_index = edge_index.tolist()
mean_ = np.array(edge_index)[:,2].mean()
mean_
0.5098736436405648
edge_index[:5]
[[1023.0, 1023.0, 0.0],
[1023.0, 1024.0, 0.9994677478343093],
[1023.0, 1028.0, 0.9902065900321946],
[1023.0, 1031.0, 0.97983815585674],
[1023.0, 1032.0, 0.97983815585674]]
selected_edges = [(int(row[0]), int(row[1])) for row in edge_index if row[2] > mean_]
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
edge_index_selected.shape
torch.Size([2, 93730])
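To summarize the weighting scheme: each within-card pair (i, j) gets weight w = exp(-|Δt|/θ) with θ the mean |Δt|, self-pairs (w = 1) are zeroed, and only edges with above-average weight are kept. A compact restatement of the same pipeline (a hedged sketch; select_edges is a hypothetical helper, not used above):

def select_edges(pairs):
    # pairs: (n, 3) float array of [src, dst, |Δt| in ns], as saved above
    theta = pairs[:, 2].mean()            # decay scale = mean |Δt|
    w = np.exp(-pairs[:, 2] / theta)      # exponential time-decay weight
    w[w == 1.0] = 0.0                     # drop self-pairs / zero time gaps
    keep = w > w.mean()                   # keep above-average weights only
    return torch.tensor(pairs[keep, :2], dtype=torch.long).t()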
data setup (x, edge_index, y)
x = torch.tensor(df50['amt'], dtype=torch.float).reshape(-1,1)
y = torch.tensor(df50['is_fraud'],dtype=torch.int64)
data = torch_geometric.data.Data(x=x, edge_index = edge_index_selected, y=y, train_mask = train_mask, test_mask = test_mask)
data
Data(x=[12012, 1], edge_index=[2, 93730], y=[12012], train_mask=[12012], test_mask=[12012])
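A couple of cheap consistency checks before training (optional; a sketch):

assert int(data.edge_index.max()) < data.num_nodes                      # every edge endpoint is a valid node
assert data.train_mask.sum() + data.test_mask.sum() == data.num_nodes   # masks partition the nodes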
summary
| Analysis | Train | Test | Model | Features | Notes |
|---|---|---|---|---|---|
| Analysis 1 | df50_tr | df50_test | GNN | amt | |
| Analysis 2 | df50_tr | df50_test | Logistic regression | amt | |
| Analysis 3 | df50_tr | df50_test | SVM | amt | |
| Analysis 4 | df50_tr | df50_test | Random Forest | amt | |
| Analysis 5 | df50_tr | df50_test | Boosting (XGBoost) | amt | |
| Analysis 6 | df50_tr | df50_test | Naive Bayes | amt | |
The table below collects the per-model results computed in the analyses that follow.

lst = [_results1, _results2, _results3, _results4, _results5, _results6]
pd.concat(lst)
|  | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| Analysis 1 | 0.889111 | 0.865884 | 0.923533 | 0.893780 |
| Analysis 2 | 0.849484 | 0.933279 | 0.756098 | 0.835397 |
| Analysis 3 | 0.850150 | 0.935510 | 0.755438 | 0.835886 |
| Analysis 4 | 0.847153 | 0.850331 | 0.846407 | 0.848365 |
| Analysis 5 | 0.880120 | 0.886957 | 0.874094 | 0.880478 |
| Analysis 6 | 0.857143 | 0.957143 | 0.750824 | 0.841522 |
Analysis 1 (GNN)
X = (data.x[data.train_mask]).numpy()
XX = (data.x[data.test_mask]).numpy()
y = (data.y[data.train_mask]).numpy()
yy = (data.y[data.test_mask]).numpy()

model = GCN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()
for epoch in range(400):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
model.eval()
GCN(
(conv1): GCNConv(1, 16)
(conv2): GCNConv(16, 2)
)
pred = model(data).argmax(dim=1)
yyhat = pred[data.test_mask]

metrics = [sklearn.metrics.accuracy_score,
sklearn.metrics.precision_score,
sklearn.metrics.recall_score,
sklearn.metrics.f1_score]

_results1 = pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics}, index=['Analysis 1'])
_results1
|  | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| Analysis 1 | 0.889111 | 0.865884 | 0.923533 | 0.89378 |
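The four-metric block above is repeated verbatim in each of the five analyses below; a small helper (hypothetical, shown only as a possible refactor) would collapse each of them to one call:

def evaluate(y_true, y_pred, name):
    # one-row DataFrame with the same four scores used throughout
    scores = [sklearn.metrics.accuracy_score, sklearn.metrics.precision_score,
              sklearn.metrics.recall_score, sklearn.metrics.f1_score]
    return pd.DataFrame({s.__name__: [s(y_true, y_pred).round(6)] for s in scores}, index=[name])

# e.g. _results2 = evaluate(yy, yyhat, 'Analysis 2')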
Analysis 2 (Logistic regression)
X = np.array(df50_tr.loc[:,['amt']])
XX = np.array(df50_test.loc[:,['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

lrnr = LogisticRegression()
lrnr.fit(X,y)
LogisticRegression()
#thresh = y.mean()
#yyhat = (lrnr.predict_proba(XX)> thresh)[:,-1]
yyhat = lrnr.predict(XX)
yyhat
array([0, 1, 0, ..., 0, 0, 1])
metrics = [sklearn.metrics.accuracy_score,
sklearn.metrics.precision_score,
sklearn.metrics.recall_score,
sklearn.metrics.f1_score]

_results2 = pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics}, index=['Analysis 2'])
_results2
|  | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| Analysis 2 | 0.849484 | 0.933279 | 0.756098 | 0.835397 |
Analysis 3 (Support Vector Machine)
X = np.array(df50_tr.loc[:, ['amt']])
XX = np.array(df50_test.loc[:, ['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

lrnr = SVC(kernel='linear')
lrnr.fit(X,y)
yyhat = lrnr.predict(XX)

metrics = [sklearn.metrics.accuracy_score,
sklearn.metrics.precision_score,
sklearn.metrics.recall_score,
sklearn.metrics.f1_score]

_results3 = pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics}, index=['Analysis 3'])
_results3
|  | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| Analysis 3 | 0.85015 | 0.93551 | 0.755438 | 0.835886 |
Analysis 4 (Random Forest)
X = np.array(df50_tr.loc[:, ['amt']])
XX = np.array(df50_test.loc[:, ['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

lrnr = RandomForestClassifier()
lrnr.fit(X, y)
yyhat = lrnr.predict(XX)

metrics = [sklearn.metrics.accuracy_score,
sklearn.metrics.precision_score,
sklearn.metrics.recall_score,
sklearn.metrics.f1_score]

_results4 = pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics}, index=['Analysis 4'])
_results4
|  | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| Analysis 4 | 0.847153 | 0.850331 | 0.846407 | 0.848365 |
Analysis 5 (Boosting)
X = np.array(df50_tr.loc[:, ['amt']])
XX = np.array(df50_test.loc[:, ['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

lrnr = xgb.XGBClassifier()
lrnr.fit(X, y)
yyhat = lrnr.predict(XX)

metrics = [sklearn.metrics.accuracy_score,
sklearn.metrics.precision_score,
sklearn.metrics.recall_score,
sklearn.metrics.f1_score]

_results5 = pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics}, index=['Analysis 5'])
_results5
|  | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| Analysis 5 | 0.88012 | 0.886957 | 0.874094 | 0.880478 |
Analysis 6 (Naive Bayes)
X = np.array(df50_tr.loc[:, ['amt']])
XX = np.array(df50_test.loc[:, ['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

lrnr = GaussianNB()
lrnr.fit(X, y)
yyhat = lrnr.predict(XX)

metrics = [sklearn.metrics.accuracy_score,
sklearn.metrics.precision_score,
sklearn.metrics.recall_score,
sklearn.metrics.f1_score]

_results6 = pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics}, index=['Analysis 6'])
_results6
|  | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| Analysis 6 | 0.857143 | 0.957143 | 0.750824 | 0.841522 |